{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "pandas中的pd.read_html()这个函数，功能非常强大，可以轻松实现抓取Table表格型数据。无需掌握正则表达式或者xpath等工具，短短的几行代码就可以将网页数据抓取下来。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## pd.read_html语法及参数\n",
    "\n",
    "```python\n",
    "pandas.read_html(io, match='.+', flavor=None,\n",
    "                 header=None, index_col=None, skiprows=None,\n",
    "                 attrs=None, parse_dates=False, thousands=',',\n",
    "                 encoding=None, decimal='.', converters=None, na_values=None,\n",
    "                 keep_default_na=True, displayed_only=True)\n",
    "```\n",
    "\n",
    "- io ：接收网址、文件、字符串；\n",
    "- parse_dates：解析日期；\n",
    "- flavor：解析器；\n",
    "- header：标题行；\n",
    "- skiprows：跳过的行；\n",
    "- attrs：属性，比如 attrs = {'id': 'table'}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 抓取世界大学排名（1页数据）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-03-27T14:53:47.043331Z",
     "start_time": "2021-03-27T14:53:45.520945Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Ranking</th>\n",
       "      <th>University Name</th>\n",
       "      <th>Country/Region</th>\n",
       "      <th>Academic Reputation</th>\n",
       "      <th>Employer Reputation</th>\n",
       "      <th>Faculty Student</th>\n",
       "      <th>International Faculty</th>\n",
       "      <th>International Students</th>\n",
       "      <th>Citations per Faculty</th>\n",
       "      <th>Overall Score</th>\n",
       "      <th>Free</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>麻省理工学院Massachusetts Institute of Technology (MIT)</td>\n",
       "      <td>United Sta</td>\n",
       "      <td>100.0</td>\n",
       "      <td>100.0</td>\n",
       "      <td>100.0</td>\n",
       "      <td>100</td>\n",
       "      <td>91.9</td>\n",
       "      <td>99.1</td>\n",
       "      <td>100.0</td>\n",
       "      <td>免费评估</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>斯坦福大学Stanford University</td>\n",
       "      <td>United Sta</td>\n",
       "      <td>100.0</td>\n",
       "      <td>100.0</td>\n",
       "      <td>100.0</td>\n",
       "      <td>99.7</td>\n",
       "      <td>63.6</td>\n",
       "      <td>98.1</td>\n",
       "      <td>98.4</td>\n",
       "      <td>免费评估</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>哈佛大学Harvard University</td>\n",
       "      <td>United Sta</td>\n",
       "      <td>100.0</td>\n",
       "      <td>100.0</td>\n",
       "      <td>98.6</td>\n",
       "      <td>85.2</td>\n",
       "      <td>69.9</td>\n",
       "      <td>99.1</td>\n",
       "      <td>97.9</td>\n",
       "      <td>免费评估</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>加州理工学院California Institute of Technology (Calt...</td>\n",
       "      <td>United Sta</td>\n",
       "      <td>97.0</td>\n",
       "      <td>82.8</td>\n",
       "      <td>100.0</td>\n",
       "      <td>100</td>\n",
       "      <td>88.2</td>\n",
       "      <td>99.9</td>\n",
       "      <td>97.0</td>\n",
       "      <td>免费评估</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>牛津大学University of Oxford</td>\n",
       "      <td>United Kin</td>\n",
       "      <td>100.0</td>\n",
       "      <td>100.0</td>\n",
       "      <td>100.0</td>\n",
       "      <td>99.4</td>\n",
       "      <td>98.3</td>\n",
       "      <td>81.3</td>\n",
       "      <td>96.7</td>\n",
       "      <td>免费评估</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>496</th>\n",
       "      <td>493</td>\n",
       "      <td>吉林大学Jilin University</td>\n",
       "      <td>China (Mai</td>\n",
       "      <td>14.9</td>\n",
       "      <td>23.1</td>\n",
       "      <td>49.9</td>\n",
       "      <td>12.4</td>\n",
       "      <td>3.2</td>\n",
       "      <td>25.2</td>\n",
       "      <td>24.2</td>\n",
       "      <td>免费评估</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>497</th>\n",
       "      <td>498</td>\n",
       "      <td>哈瓦那大学Universidad de La Habana</td>\n",
       "      <td>Cuba</td>\n",
       "      <td>16.0</td>\n",
       "      <td>25.8</td>\n",
       "      <td>70.5</td>\n",
       "      <td>2.4</td>\n",
       "      <td>5.5</td>\n",
       "      <td>2.4</td>\n",
       "      <td>24.1</td>\n",
       "      <td>免费评估</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>498</th>\n",
       "      <td>499</td>\n",
       "      <td>阿尔卡拉大学niversidad de Alcalá</td>\n",
       "      <td>Spain</td>\n",
       "      <td>9.8</td>\n",
       "      <td>13.2</td>\n",
       "      <td>58.6</td>\n",
       "      <td>22.4</td>\n",
       "      <td>70.6</td>\n",
       "      <td>10.6</td>\n",
       "      <td>23.9</td>\n",
       "      <td>免费评估</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>499</th>\n",
       "      <td>499</td>\n",
       "      <td>东芬兰大学University of Eastern Finland</td>\n",
       "      <td>Finland</td>\n",
       "      <td>14.7</td>\n",
       "      <td>9.0</td>\n",
       "      <td>42.9</td>\n",
       "      <td>25.8</td>\n",
       "      <td>7.3</td>\n",
       "      <td>33.5</td>\n",
       "      <td>23.9</td>\n",
       "      <td>免费评估</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>500</th>\n",
       "      <td>499</td>\n",
       "      <td>俄勒冈州立大学Oregon State University</td>\n",
       "      <td>United Sta</td>\n",
       "      <td>17.8</td>\n",
       "      <td>7.3</td>\n",
       "      <td>31.5</td>\n",
       "      <td>32.6</td>\n",
       "      <td>18.7</td>\n",
       "      <td>35.0</td>\n",
       "      <td>23.9</td>\n",
       "      <td>免费评估</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>501 rows × 11 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     Ranking                                    University Name  \\\n",
       "0          1  麻省理工学院Massachusetts Institute of Technology (MIT)   \n",
       "1          2                           斯坦福大学Stanford University   \n",
       "2          3                             哈佛大学Harvard University   \n",
       "3          4  加州理工学院California Institute of Technology (Calt...   \n",
       "4          5                           牛津大学University of Oxford   \n",
       "..       ...                                                ...   \n",
       "496      493                               吉林大学Jilin University   \n",
       "497      498                      哈瓦那大学Universidad de La Habana   \n",
       "498      499                         阿尔卡拉大学niversidad de Alcalá   \n",
       "499      499                 东芬兰大学University of Eastern Finland   \n",
       "500      499                     俄勒冈州立大学Oregon State University   \n",
       "\n",
       "    Country/Region  Academic Reputation  Employer Reputation  Faculty Student  \\\n",
       "0       United Sta                100.0                100.0            100.0   \n",
       "1       United Sta                100.0                100.0            100.0   \n",
       "2       United Sta                100.0                100.0             98.6   \n",
       "3       United Sta                 97.0                 82.8            100.0   \n",
       "4       United Kin                100.0                100.0            100.0   \n",
       "..             ...                  ...                  ...              ...   \n",
       "496     China (Mai                 14.9                 23.1             49.9   \n",
       "497           Cuba                 16.0                 25.8             70.5   \n",
       "498          Spain                  9.8                 13.2             58.6   \n",
       "499        Finland                 14.7                  9.0             42.9   \n",
       "500     United Sta                 17.8                  7.3             31.5   \n",
       "\n",
       "    International Faculty International Students  Citations per Faculty  \\\n",
       "0                     100                   91.9                   99.1   \n",
       "1                    99.7                   63.6                   98.1   \n",
       "2                    85.2                   69.9                   99.1   \n",
       "3                     100                   88.2                   99.9   \n",
       "4                    99.4                   98.3                   81.3   \n",
       "..                    ...                    ...                    ...   \n",
       "496                  12.4                    3.2                   25.2   \n",
       "497                   2.4                    5.5                    2.4   \n",
       "498                  22.4                   70.6                   10.6   \n",
       "499                  25.8                    7.3                   33.5   \n",
       "500                  32.6                   18.7                   35.0   \n",
       "\n",
       "     Overall Score  Free  \n",
       "0            100.0  免费评估  \n",
       "1             98.4  免费评估  \n",
       "2             97.9  免费评估  \n",
       "3             97.0  免费评估  \n",
       "4             96.7  免费评估  \n",
       "..             ...   ...  \n",
       "496           24.2  免费评估  \n",
       "497           24.1  免费评估  \n",
       "498           23.9  免费评估  \n",
       "499           23.9  免费评估  \n",
       "500           23.9  免费评估  \n",
       "\n",
       "[501 rows x 11 columns]"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import csv\n",
    "\n",
    "# pd.read_html returns a list with one DataFrame per HTML table found on the page.\n",
    "url1 = 'http://www.compassedu.hk/qs'\n",
    "tables1 = pd.read_html(url1)\n",
    "df1 = tables1[0]  # index 0 selects the first table on the page\n",
    "# To save the ranking, uncomment: df1.to_csv('世界大学综合排名.csv', index=0)\n",
    "df1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 抓取新浪财经基金重仓股数据（6页数据）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-03-27T14:53:48.201326Z",
     "start_time": "2021-03-27T14:53:47.045917Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "第1页抓取完成\n",
      "第2页抓取完成\n",
      "第3页抓取完成\n",
      "第4页抓取完成\n",
      "第5页抓取完成\n",
      "第6页抓取完成\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>代码</th>\n",
       "      <th>简称</th>\n",
       "      <th>截至日期</th>\n",
       "      <th>家数</th>\n",
       "      <th>本期持股数(万股)</th>\n",
       "      <th>持股占已流通A股比例(%)</th>\n",
       "      <th>同上期增减(万股)</th>\n",
       "      <th>持股比例(%)</th>\n",
       "      <th>上期家数</th>\n",
       "      <th>明细</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>600009</td>\n",
       "      <td>上海机场</td>\n",
       "      <td>2020-12-31</td>\n",
       "      <td>2</td>\n",
       "      <td>3207.7980</td>\n",
       "      <td>1.66</td>\n",
       "      <td>92.4934</td>\n",
       "      <td>1.62</td>\n",
       "      <td>2</td>\n",
       "      <td>+展开明细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>600030</td>\n",
       "      <td>中信证券</td>\n",
       "      <td>2020-12-31</td>\n",
       "      <td>1</td>\n",
       "      <td>18326.5129</td>\n",
       "      <td>1.42</td>\n",
       "      <td>1466.9900</td>\n",
       "      <td>1.30</td>\n",
       "      <td>1</td>\n",
       "      <td>+展开明细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>600062</td>\n",
       "      <td>华润双鹤</td>\n",
       "      <td>2020-12-31</td>\n",
       "      <td>1</td>\n",
       "      <td>244.5468</td>\n",
       "      <td>0.23</td>\n",
       "      <td>-20.2800</td>\n",
       "      <td>0.25</td>\n",
       "      <td>1</td>\n",
       "      <td>+展开明细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>600068</td>\n",
       "      <td>葛洲坝</td>\n",
       "      <td>2020-12-31</td>\n",
       "      <td>3</td>\n",
       "      <td>4526.2332</td>\n",
       "      <td>0.98</td>\n",
       "      <td>-2612.7310</td>\n",
       "      <td>1.55</td>\n",
       "      <td>4</td>\n",
       "      <td>+展开明细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>601975</td>\n",
       "      <td>招商南油</td>\n",
       "      <td>2020-12-31</td>\n",
       "      <td>1</td>\n",
       "      <td>1622.0343</td>\n",
       "      <td>0.33</td>\n",
       "      <td>-124.9100</td>\n",
       "      <td>0.35</td>\n",
       "      <td>1</td>\n",
       "      <td>+展开明细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>688021</td>\n",
       "      <td>奥福环保</td>\n",
       "      <td>2020-12-31</td>\n",
       "      <td>1</td>\n",
       "      <td>131.2360</td>\n",
       "      <td>1.70</td>\n",
       "      <td>-278.6177</td>\n",
       "      <td>5.30</td>\n",
       "      <td>6</td>\n",
       "      <td>+展开明细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>300677</td>\n",
       "      <td>英科医疗</td>\n",
       "      <td>2020-12-31</td>\n",
       "      <td>5</td>\n",
       "      <td>1963.6132</td>\n",
       "      <td>5.57</td>\n",
       "      <td>564.9432</td>\n",
       "      <td>6.29</td>\n",
       "      <td>4</td>\n",
       "      <td>+展开明细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>300738</td>\n",
       "      <td>奥飞数据</td>\n",
       "      <td>2020-12-31</td>\n",
       "      <td>2</td>\n",
       "      <td>449.9611</td>\n",
       "      <td>2.12</td>\n",
       "      <td>-396.7645</td>\n",
       "      <td>4.24</td>\n",
       "      <td>4</td>\n",
       "      <td>+展开明细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>688003</td>\n",
       "      <td>天准科技</td>\n",
       "      <td>2020-12-31</td>\n",
       "      <td>2</td>\n",
       "      <td>305.7298</td>\n",
       "      <td>1.58</td>\n",
       "      <td>305.7298</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0</td>\n",
       "      <td>+展开明细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39</th>\n",
       "      <td>600933</td>\n",
       "      <td>爱柯迪</td>\n",
       "      <td>2020-12-31</td>\n",
       "      <td>2</td>\n",
       "      <td>2123.3462</td>\n",
       "      <td>2.47</td>\n",
       "      <td>-38.8767</td>\n",
       "      <td>2.52</td>\n",
       "      <td>3</td>\n",
       "      <td>+展开明细</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>240 rows × 10 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        代码    简称        截至日期  家数   本期持股数(万股)  持股占已流通A股比例(%)  同上期增减(万股)  \\\n",
       "0   600009  上海机场  2020-12-31   2   3207.7980           1.66    92.4934   \n",
       "1   600030  中信证券  2020-12-31   1  18326.5129           1.42  1466.9900   \n",
       "2   600062  华润双鹤  2020-12-31   1    244.5468           0.23   -20.2800   \n",
       "3   600068   葛洲坝  2020-12-31   3   4526.2332           0.98 -2612.7310   \n",
       "4   601975  招商南油  2020-12-31   1   1622.0343           0.33  -124.9100   \n",
       "..     ...   ...         ...  ..         ...            ...        ...   \n",
       "35  688021  奥福环保  2020-12-31   1    131.2360           1.70  -278.6177   \n",
       "36  300677  英科医疗  2020-12-31   5   1963.6132           5.57   564.9432   \n",
       "37  300738  奥飞数据  2020-12-31   2    449.9611           2.12  -396.7645   \n",
       "38  688003  天准科技  2020-12-31   2    305.7298           1.58   305.7298   \n",
       "39  600933   爱柯迪  2020-12-31   2   2123.3462           2.47   -38.8767   \n",
       "\n",
       "    持股比例(%)  上期家数     明细  \n",
       "0      1.62     2  +展开明细  \n",
       "1      1.30     1  +展开明细  \n",
       "2      0.25     1  +展开明细  \n",
       "3      1.55     4  +展开明细  \n",
       "4      0.35     1  +展开明细  \n",
       "..      ...   ...    ...  \n",
       "35     5.30     6  +展开明细  \n",
       "36     6.29     4  +展开明细  \n",
       "37     4.24     4  +展开明细  \n",
       "38     0.00     0  +展开明细  \n",
       "39     2.52     3  +展开明细  \n",
       "\n",
       "[240 rows x 10 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import csv\n",
    "\n",
    "# Collect one DataFrame per page and concatenate once after the loop:\n",
    "# calling pd.concat inside the loop re-copies every row gathered so far on each page.\n",
    "df2_pages = []\n",
    "for i in range(6):  # pages are numbered 1..6 in the URL, hence i+1 below\n",
    "    url2 = 'http://vip.stock.finance.sina.com.cn/q/go.php/vComStockHold/kind/jjzc/index.phtml?p={page}'.format(page=i+1)\n",
    "    df2_pages.append(pd.read_html(url2)[0])  # [0]: first table on the page\n",
    "    print('第{page}页抓取完成'.format(page = i + 1))\n",
    "# Row labels are kept exactly as read from each page, so they restart on every page;\n",
    "# pass ignore_index=True to pd.concat if a single running index is wanted.\n",
    "df2 = pd.concat(df2_pages)\n",
    "# To save the result, uncomment: df2.to_csv('./新浪财经数据.csv',encoding='utf-8',index=0)\n",
    "df2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 抓取证监会披露的IPO数据（49页数据）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-03-27T14:54:13.766414Z",
     "start_time": "2021-03-27T14:53:48.208898Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "第1页抓取完成\n",
      "第2页抓取完成\n",
      "第3页抓取完成\n",
      "第4页抓取完成\n",
      "第5页抓取完成\n",
      "第6页抓取完成\n",
      "第7页抓取完成\n",
      "第8页抓取完成\n",
      "第9页抓取完成\n",
      "第10页抓取完成\n",
      "第11页抓取完成\n",
      "第12页抓取完成\n",
      "第13页抓取完成\n",
      "第14页抓取完成\n",
      "第15页抓取完成\n",
      "第16页抓取完成\n",
      "第17页抓取完成\n",
      "第18页抓取完成\n",
      "第19页抓取完成\n",
      "第20页抓取完成\n",
      "第21页抓取完成\n",
      "第22页抓取完成\n",
      "第23页抓取完成\n",
      "第24页抓取完成\n",
      "第25页抓取完成\n",
      "第26页抓取完成\n",
      "第27页抓取完成\n",
      "第28页抓取完成\n",
      "第29页抓取完成\n",
      "第30页抓取完成\n",
      "第31页抓取完成\n",
      "第32页抓取完成\n",
      "第33页抓取完成\n",
      "第34页抓取完成\n",
      "第35页抓取完成\n",
      "第36页抓取完成\n",
      "第37页抓取完成\n",
      "第38页抓取完成\n",
      "第39页抓取完成\n",
      "第40页抓取完成\n",
      "第41页抓取完成\n",
      "第42页抓取完成\n",
      "第43页抓取完成\n",
      "第44页抓取完成\n",
      "第45页抓取完成\n",
      "第46页抓取完成\n",
      "第47页抓取完成\n",
      "第48页抓取完成\n",
      "第49页抓取完成\n",
      "共抓取 637 条记录,用时 0.43 分钟\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>公司名称</th>\n",
       "      <th>披露日期</th>\n",
       "      <th>上市板块</th>\n",
       "      <th>保荐机构</th>\n",
       "      <th>披露时间</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>赛赫智能设备（上海）股份有限公司</td>\n",
       "      <td>问询与回复</td>\n",
       "      <td>沪市科创板</td>\n",
       "      <td>国信证券股份有限公司</td>\n",
       "      <td>2021-03-26</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>赛赫智能设备（上海）股份有限公司</td>\n",
       "      <td>问询与回复</td>\n",
       "      <td>沪市科创板</td>\n",
       "      <td>国信证券股份有限公司</td>\n",
       "      <td>2021-03-26</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>苏州瑞博生物技术股份有限公司</td>\n",
       "      <td>问询与回复</td>\n",
       "      <td>沪市科创板</td>\n",
       "      <td>国泰君安证券股份有限公司</td>\n",
       "      <td>2021-03-26</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>苏州瑞博生物技术股份有限公司</td>\n",
       "      <td>问询与回复</td>\n",
       "      <td>沪市科创板</td>\n",
       "      <td>国泰君安证券股份有限公司</td>\n",
       "      <td>2021-03-26</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>苏州瑞博生物技术股份有限公司</td>\n",
       "      <td>问询与回复</td>\n",
       "      <td>沪市科创板</td>\n",
       "      <td>国泰君安证券股份有限公司</td>\n",
       "      <td>2021-03-26</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>苏州明志科技股份有限公司</td>\n",
       "      <td>注册稿</td>\n",
       "      <td>沪市科创板</td>\n",
       "      <td>东吴证券股份有限公司</td>\n",
       "      <td>2021-02-04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>苏州明志科技股份有限公司</td>\n",
       "      <td>注册稿</td>\n",
       "      <td>沪市科创板</td>\n",
       "      <td>东吴证券股份有限公司</td>\n",
       "      <td>2021-02-04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>江苏迈信林航空科技股份有限公司</td>\n",
       "      <td>注册稿</td>\n",
       "      <td>沪市科创板</td>\n",
       "      <td>海通证券股份有限公司</td>\n",
       "      <td>2021-02-04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>江苏迈信林航空科技股份有限公司</td>\n",
       "      <td>注册稿</td>\n",
       "      <td>沪市科创板</td>\n",
       "      <td>海通证券股份有限公司</td>\n",
       "      <td>2021-02-04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>江苏迈信林航空科技股份有限公司</td>\n",
       "      <td>注册稿</td>\n",
       "      <td>沪市科创板</td>\n",
       "      <td>海通证券股份有限公司</td>\n",
       "      <td>2021-02-04</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>637 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                公司名称   披露日期   上市板块          保荐机构        披露时间\n",
       "1   赛赫智能设备（上海）股份有限公司  问询与回复  沪市科创板    国信证券股份有限公司  2021-03-26\n",
       "2   赛赫智能设备（上海）股份有限公司  问询与回复  沪市科创板    国信证券股份有限公司  2021-03-26\n",
       "3     苏州瑞博生物技术股份有限公司  问询与回复  沪市科创板  国泰君安证券股份有限公司  2021-03-26\n",
       "4     苏州瑞博生物技术股份有限公司  问询与回复  沪市科创板  国泰君安证券股份有限公司  2021-03-26\n",
       "5     苏州瑞博生物技术股份有限公司  问询与回复  沪市科创板  国泰君安证券股份有限公司  2021-03-26\n",
       "..               ...    ...    ...           ...         ...\n",
       "9       苏州明志科技股份有限公司    注册稿  沪市科创板    东吴证券股份有限公司  2021-02-04\n",
       "10      苏州明志科技股份有限公司    注册稿  沪市科创板    东吴证券股份有限公司  2021-02-04\n",
       "11   江苏迈信林航空科技股份有限公司    注册稿  沪市科创板    海通证券股份有限公司  2021-02-04\n",
       "12   江苏迈信林航空科技股份有限公司    注册稿  沪市科创板    海通证券股份有限公司  2021-02-04\n",
       "13   江苏迈信林航空科技股份有限公司    注册稿  沪市科创板    海通证券股份有限公司  2021-02-04\n",
       "\n",
       "[637 rows x 5 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from pandas import DataFrame\n",
    "import csv\n",
    "import time\n",
    "\n",
    "start = time.time()  # start of the wall-clock timer for the whole crawl\n",
    "df3_columns = ['公司名称','披露日期','上市板块','保荐机构','披露时间']\n",
    "# Collect one cleaned DataFrame per page and concatenate once after the loop:\n",
    "# calling pd.concat inside the loop re-copies every row gathered so far on each page.\n",
    "df3_pages = []\n",
    "for i in range(1,50):  # requests index_1.html .. index_49.html\n",
    "    url3 ='http://eid.csrc.gov.cn/ipo/1010/index_%s.html'%str(i)\n",
    "    df3_1 = pd.read_html(url3,encoding='utf-8')[0]  # utf-8 must be given explicitly, otherwise the text comes out garbled\n",
    "    # Drop the first row, the last row and the last column (a NaN column).\n",
    "    # NOTE(review): the first row is presumably the page's own header row - confirm against the site.\n",
    "    df3_2 = df3_1.iloc[1:len(df3_1)-1,0:-1]\n",
    "    df3_2.columns = df3_columns  # give every page the same column names so the pages line up\n",
    "    df3_pages.append(df3_2)\n",
    "    print('第{page}页抓取完成'.format(page=i))\n",
    "df3 = pd.concat(df3_pages)  # merge all pages; per-page row labels are kept as-is\n",
    "# To save the result, uncomment: df3.to_csv('./上市公司IPO信息.csv', encoding='utf-8',index=0)\n",
    "end = time.time()\n",
    "print ('共抓取',len(df3),'条记录,' + '用时',round((end-start)/60,2),'分钟')\n",
    "df3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.6"
  },
  "nbTranslate": {
   "displayLangs": [
    "*"
   ],
   "hotkey": "alt-t",
   "langInMainMenu": true,
   "sourceLang": "en",
   "targetLang": "fr",
   "useGoogleTranslate": true
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
